This document describes the steps, code, and files used to generate the ~50kb TwinStrand sequencing panel to study the genetics of male infertility. Specifically, we are targeting DNA damage repair genes, known cancer genes, AZF genes, and genes involved in clonal spermatogenesis.
This is the script used to generate a targeted sequencing panel for the spermseq male infertility project. )
Step 1: Copy the gene list from the spreadsheet and save as a 1-column text file (target_genes.txt) Step 2: Run below script to save into kingspeak UNIX environment
## Change working directory
cd ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set
## Sort and unique
cat target_genes.txt | sort | uniq > target_genes_v1.txt
## Copy to kingspeak directory
scp target_genes_v1.txt u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets
Use the build 38 GTF file to get other identifying information for each gene_name of interest. Look only at genes/regions that are coding sequences (hence grep -w “CDS” in the below command…)
## Define directory
cd ~/spermseq/twinstrand_target
## Download GTF file
wget ftp://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz | zless | grep -w "CDS" > Homo_sapiens.GRCh38.102.CDS.gtf
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets/Homo_sapiens.GRCh38.102.CDS.gtf ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data
## Get the gene_ids of the target genes of interest
cat target_genes_v1.txt | awk '{print "cat Homo_sapiens.GRCh38.102.CDS.gtf | grep \"gene_name \\\""$1"\\\"\""}' | bash | cut -f 9 | awk 'BEGIN{FS = "\"; "} ; {print $0}' | sed 's/\"; /|/g' | sed 's/;//g' | sed 's/ \"/=/g' > target_gene_list_gtf_identifiers.txt
## Copy target_genes_gid_tid_pid.txt file to local
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets/target_gene_list_gtf_identifiers.txt ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set
Use R to obtain key GTF fields for each gene: gene_name, gene_id, transcript_id, protein_id, and exon_number.
## Read in the file
df <- read.table("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/target_gene_list_gtf_identifiers.txt")
## Go through each row and get the gene_id, gene_name, transcript_id, protein_id, exon_number, and exon_id
x <- df[1,]
ids <- rbindlist(apply(df, 1, function(x) {
fields <- (strsplit(x, split = "|", fixed = TRUE))[[1]]
fields <- fields[grep(paste(c("gene_name", "gene_id", "transcript_id", "protein_id", "exon_number"), collapse = "|"), fields)] %>%
strsplit(., split = "=", fixed = TRUE) %>% unlist()
return(data.table("gene_name"=fields[8],
"gene_id"=fields[2],
"transcript_id"=fields[4],
"protein_id"=fields[10],
"exon_number"=fields[6]))
}))
write.table(x = ids, file = "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/target_gene_list_ids.txt", quote = FALSE, sep = "\t", row.names = FALSE)
Obtain GTEx TPM data using gene_ids belonging to the genes of interest.
## Copy the gene list ids file to kingspeak
scp ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/target_gene_list_ids.txt u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets
## Download master sample list to identify testis samples
cd ~/spermseq/data/GTEx/samples
wget https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
grep -w -i "testis" GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt | awk '{print $1}' > GTEx_Analysis_v8_Annotations_SampleAttributesDS_TestisOnlySamples_Data.txt
## Download master transcript TPM data
cd ~/spermseq/data/GTEx/transcripts
wget https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz
gunzip GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz
## Obtain transcript TPM data from testis samples --> see "Get TPM data from GTEx testis samples"
cd ~/spermseq/data/GTEx/transcripts
head -n 1 GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct > header.temp ## Get the header which contains sample ids
## Run this if the file already exists: rm ~/spermseq/data/GTEx/transcripts/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct
cat ~/spermseq/twinstrand_targets/target_gene_list_ids.txt | awk -v OFS='\t' '{print $1, $2}' | sort | uniq | grep -v "gene_name" | cut -f 2 | awk '{print "cat $HOME/spermseq/data/GTEx/transcripts/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct | grep "$1" "}' | bash >> ~/spermseq/data/GTEx/transcripts/testis.data.temp
cat header.temp testis.data.temp > GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct
## Remove files
rm header.temp
rm testis.data.temp
## Copy final file to local directory
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/GTEx/transcripts/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data
## Download appris principal isoform data
cd ~/spermseq/data/appris
wget http://apprisws.bioinfo.cnio.es/pub/current_release/datafiles/homo_sapiens/GRCh38/appris_data.principal.txt
## Copy to local directory
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/appris/appris_data.principal.txt ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data
[insert image]
knitr::opts_chunk$set(echo=TRUE)
######################################
##### Load in necessary packages #####
######################################
library(data.table)
library(dplyr)
library(ggplot2)
library(beeswarm)
library(ggbeeswarm)
library(reshape)
library(ggpubr)
library(biomaRt)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
##################################
##### Specify file locations #####
##################################
appris_file <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/appris_data.principal.txt"
gtf_file <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/Homo_sapiens.GRCh38.102.CDS.gtf"
GTEx_file <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct"
GTEx_samples <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/GTEx_testis_samples.txt"
#########################################
##### Specify the genes of interest #####
#########################################
genes <- c("WT1", "CDC14A", "DMRT1", "KLHL10", "M1AP", "MEI1", "STAG3",
"SYCP2", "SYCP3", "TEX11", "USP26", "PKD1", "AR", "AKT3", "APOA1",
"APC", "ATM", "BRAF", "BRCA1", "BRCA2", "CBL", "CDKN2A", "CTNNB1", "DICER1",
"DNMT3A", "ERCC1", "ESR1", "FANCM", "FGFR2", "FGFR3", "H3-3A", "H3-3B", "HRAS",
"KIT", "KRAS", "LIG4", "MAP2K1", "MAP2K2", "MLH1", "MLH3", "MSH5", "NR5A1", "NF1",
"PMS2", "PPM1D", "PRKACA", "PTCH1", "PTPN11", "RAG1", "RAF1", "RB1", "RET", "SMAD4",
"SOS1", "STAT3", "STK11", "TEX14", "TEX15", "TSHR", "VHL", "XPA", "XRCC1")
moderate_genes <- c("USP26", "TEX14", "SYCP2", "STAG3", "PKD1", "M1AP", "KLHL10", "DMRT1", "CDC14A")
moderate_gene_ids <- c("ENSG00000134588", "ENSG00000121101", "ENSG00000196074", "ENSG00000066923", "ENSG00000008710", "ENSG00000159374", "ENSG00000161594", "ENSG00000137090", "ENSG00000079335")
genes <- genes[-which(genes %in% moderate_genes)]
genes <- c(genes, "USP9Y", "DDX3Y", "PRY")
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/testis_GTEx_samples.R")
testis_GTEx_tpm <- testis_GTEx_samples(GTEx_file = GTEx_file, GTEx_samples = GTEx_samples)
head(testis_GTEx_tpm)
See APPRIS for column details… SANITY CHECK: This steps check that each gene of interest has an APPRIS isoform tag. If not, it will obtain the gene_id from biomaRt using the gene_name. No transcript_ids or CCDS_id will be obtained and the ‘status’ column will be flagged with a “minor” APPRIS designation.
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/appris_manipulation.R")
appris_df <- appris_manipulation(appris_file = appris_file, genes = genes)
## The following genes do not have any APPRIS isoform information:
head(appris_df)
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/appris_manipulation.R")
appris_df_tpm <- get_appris_tpm(appris_df = appris_df, testis_GTEx_tpm = testis_GTEx_tpm)
appris_df_tpm <- data.table(do.call("rbind", appris_df_tpm)) %>% `colnames<-`(c("gene_name", "gene_id", "transcript_id", "CCDS", "status", "avg_GTEx_TPM"))
appris_df_tpm
## See which appris isoforms did not have GTEx values
#genes[! genes %in% unique(appris_df_tpm$gene_name)]
These are transcripts that are not identified as PRINCIPAL isoforms by APPRIS, but have avg TPM values across all testis samples > the minimum value of the APPRIS-defined PRINCIPAL isoforms.
NOTE: The GTEx-only transcript must have an avg TPM > 1 across all testis samples to be included in the dataset
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtex_transcript_tpm_manipulation.R")
appris_gtex_transcript_tpm <- gtex_transcript_tpm_manipulation(appris_df_tpm = appris_df_tpm, testis_GTEx_tpm = testis_GTEx_tpm, moderate_gene_ids = moderate_gene_ids)
# ## Run this command to filter out appris exons not reliably defined as principal exons
# appris_gtex_transcript_tpm <- appris_gtex_transcript_tpm[!(status %in% c("PRINCIPAL:4", "PRINCIPAL:5", "ALTERNATIVE:1", "ALTERNATIVE:2"))]
# file.remove("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")
appris_gtex_transcript_tpm
## This is the longest step, see if the data already exists
if (file.exists("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")) {
load("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")
}
if (!file.exists("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")) {
## Read in the gtf file
gtf <- fread(gtf_file, header=FALSE) %>% .[,c(1,4,5,9)] %>% `colnames<-` (c("chr", "start", "stop", "info"))
## Get exon positions
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_manipulation.R")
gtf_df <- gtf_manipulation(gtf = gtf, df = appris_gtex_transcript_tpm)
head(gtf_df)
save.image("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")
## Write output to text file
write.table(x = gtf_df, file = "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/output/appris_gtex_gtf_testis.txt",
quote = FALSE, sep = "\t", row.names = FALSE)
}
Schematic depicting how different exons across isoforms of a gene are merged.
## Output the warning messages of the pfam domain exon analysis
if (file.exists("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/get_pfam_log.txt")) {
file.remove("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/get_pfam_log.txt")
}
## [1] TRUE
## Capture output to a file
sink_file <- file("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/get_pfam_log.txt", open = "wt")
sink(sink_file)
sink(sink_file, type = "message")
## Run the function
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/get_pfam.R")
pfam_df <- get_pfam(df = gtf_df)
## [1] "ENST00000263826" "AKT3"
## [1] "ENST00000508376" "APC"
## [1] "ENST00000257430" "APC"
## [1] "ENST00000375320" "APOA1"
## [1] "ENST00000375323" "APOA1"
## [1] "ENST00000359492" "APOA1"
## [1] "ENST00000236850" "APOA1"
## [1] "ENST00000374690" "AR"
## [1] "ENST00000452508" "ATM"
## [1] "ENST00000278616" "ATM"
## [1] "ENST00000288602" "BRAF"
## [1] "ENST00000496384" "BRAF"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000419060'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000357654" "BRCA1"
## [1] "ENST00000471181" "BRCA1"
## [1] "ENST00000352993" "BRCA1"
## [1] "ENST00000461574" "BRCA1"
## [1] "gene: BRCA1 with transcript: ENST00000461574 do not have any pfam information"
## [1] "ENST00000544455" "BRCA2"
## [1] "ENST00000380152" "BRCA2"
## [1] "ENST00000634840" "CBL"
## [1] "ENST00000264033" "CBL"
## [1] "ENST00000634586" "CBL"
## [1] "ENST00000498124" "CDKN2A"
## [1] "gene: CDKN2A with transcript: ENST00000498124 do not have any pfam information"
## [1] "ENST00000304494" "CDKN2A"
## [1] "gene: CDKN2A with transcript: ENST00000304494 do not have any pfam information"
## [1] "ENST00000479692" "CDKN2A"
## [1] "gene: CDKN2A with transcript: ENST00000479692 do not have any pfam information"
## [1] "ENST00000497750" "CDKN2A"
## [1] "gene: CDKN2A with transcript: ENST00000497750 do not have any pfam information"
## [1] "ENST00000578845" "CDKN2A"
## [1] "gene: CDKN2A with transcript: ENST00000578845 do not have any pfam information"
## [1] "ENST00000579755" "CDKN2A"
## [1] "ENST00000349496" "CTNNB1"
## [1] "ENST00000450969" "CTNNB1"
## [1] "gene: CTNNB1 with transcript: ENST00000450969 do not have any pfam information"
## [1] "ENST00000405570" "CTNNB1"
## [1] "ENST00000433400" "CTNNB1"
## [1] "gene: CTNNB1 with transcript: ENST00000433400 do not have any pfam information"
## [1] "ENST00000396183" "CTNNB1"
## [1] "ENST00000396185" "CTNNB1"
## [1] "ENST00000431914" "CTNNB1"
## [1] "gene: CTNNB1 with transcript: ENST00000431914 do not have any pfam information"
## [1] "ENST00000441708" "CTNNB1"
## [1] "gene: CTNNB1 with transcript: ENST00000441708 do not have any pfam information"
## [1] "ENST00000453024" "CTNNB1"
## [1] "ENST00000336079" "DDX3Y"
## [1] "ENST00000360160" "DDX3Y"
## [1] "ENST00000527414" "DICER1"
## [1] "ENST00000343455" "DICER1"
## [1] "ENST00000393063" "DICER1"
## [1] "ENST00000526495" "DICER1"
## [1] "ENST00000532939" "DICER1"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000452556'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000556045" "DICER1"
## [1] "ENST00000264709" "DNMT3A"
## [1] "ENST00000321117" "DNMT3A"
## [1] "ENST00000402667" "DNMT3A"
## [1] "ENST00000380746" "DNMT3A"
## [1] "ENST00000300853" "ERCC1"
## [1] "ENST00000589165" "ERCC1"
## [1] "ENST00000013807" "ERCC1"
## [1] "ENST00000340192" "ERCC1"
## [1] "ENST00000423698" "ERCC1"
## [1] "ENST00000587888" "ERCC1"
## [1] "gene: ERCC1 with transcript: ENST00000587888 do not have any pfam information"
## [1] "ENST00000206249" "ESR1"
## [1] "ENST00000338799" "ESR1"
## [1] "ENST00000440973" "ESR1"
## [1] "ENST00000443427" "ESR1"
## [1] "ENST00000267430" "FANCM"
## [1] "ENST00000554809" "FANCM"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000450632'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000557110" "FANCM"
## [1] "ENST00000351936" "FGFR2"
## [1] "ENST00000346997" "FGFR2"
## [1] "ENST00000358487" "FGFR2"
## [1] "ENST00000457416" "FGFR2"
## [1] "ENST00000369061" "FGFR2"
## [1] "ENST00000613048" "FGFR2"
## [1] "ENST00000613324" "FGFR2"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000481464'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000638709" "FGFR2"
## [1] "gene: FGFR2 with transcript: ENST00000638709 do not have any pfam information"
## [1] "ENST00000412135" "FGFR3"
## [1] "ENST00000440486" "FGFR3"
## [1] "ENST00000260795" "FGFR3"
## [1] "ENST00000352904" "FGFR3"
## [1] "ENST00000481110" "FGFR3"
## [1] "ENST00000366815" "H3-3A"
## [1] "ENST00000366816" "H3-3A"
## [1] "ENST00000366813" "H3-3A"
## [1] "ENST00000366814" "H3-3A"
## [1] "ENST00000589599" "H3-3B"
## [1] "ENST00000587560" "H3-3B"
## [1] "ENST00000586607" "H3-3B"
## [1] "ENST00000254810" "H3-3B"
## [1] "ENST00000586270" "H3-3B"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000465403'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000591890" "H3-3B"
## [1] "ENST00000592643" "H3-3B"
## [1] "ENST00000451590" "HRAS"
## [1] "ENST00000311189" "HRAS"
## [1] "ENST00000397596" "HRAS"
## [1] "ENST00000397594" "HRAS"
## [1] "ENST00000412167" "KIT"
## [1] "ENST00000288135" "KIT"
## [1] "ENST00000311936" "KRAS"
## [1] "ENST00000256078" "KRAS"
## [1] "ENST00000611712" "LIG4"
## [1] "ENST00000405925" "LIG4"
## [1] "ENST00000442234" "LIG4"
## [1] "ENST00000307102" "MAP2K1"
## [1] "ENST00000566326" "MAP2K1"
## [1] "ENST00000262948" "MAP2K2"
## [1] "ENST00000394867" "MAP2K2"
## [1] "ENST00000401548" "MEI1"
## [1] "gene: MEI1 with transcript: ENST00000401548 do not have any pfam information"
## [1] "ENST00000423900" "MEI1"
## [1] "gene: MEI1 with transcript: ENST00000423900 do not have any pfam information"
## [1] "ENST00000231790" "MLH1"
## [1] "ENST00000355774" "MLH3"
## [1] "ENST00000380968" "MLH3"
## [1] "ENST00000553713" "MLH3"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000451130'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000554697" "MLH3"
## [1] "gene: MLH3 with transcript: ENST00000554697 do not have any pfam information"
## [1] "ENST00000375703" "MSH5"
## [1] "ENST00000375755" "MSH5"
## [1] "ENST00000375750" "MSH5"
## [1] "ENST00000425703" "MSH5"
## [1] "gene: MSH5 with transcript: ENST00000425703 do not have any pfam information"
## [1] "ENST00000429846" "MSH5"
## [1] "ENST00000463144" "MSH5"
## [1] "gene: MSH5 with transcript: ENST00000463144 do not have any pfam information"
## [1] "ENST00000358273" "NF1"
## [1] "ENST00000356175" "NF1"
## [1] "ENST00000471572" "NF1"
## [1] "gene: NF1 with transcript: ENST00000471572 do not have any pfam information"
## [1] "ENST00000373588" "NR5A1"
## [1] "ENST00000265849" "PMS2"
## [1] "ENST00000305921" "PPM1D"
## [1] "ENST00000392995" "PPM1D"
## [1] "ENST00000308677" "PRKACA"
## [1] "ENST00000303728" "PRY"
## [1] "gene: PRY with transcript: ENST00000303728 do not have any pfam information"
## [1] "ENST00000429896" "PTCH1"
## [1] "ENST00000437951" "PTCH1"
## [1] "ENST00000421141" "PTCH1"
## [1] "ENST00000375274" "PTCH1"
## [1] "ENST00000430669" "PTCH1"
## [1] "ENST00000331920" "PTCH1"
## [1] "ENST00000418258" "PTCH1"
## [1] "ENST00000375290" "PTCH1"
## [1] "ENST00000551630" "PTCH1"
## [1] "gene: PTCH1 with transcript: ENST00000551630 do not have any pfam information"
## [1] "ENST00000635625" "PTPN11"
## [1] "ENST00000351677" "PTPN11"
## [1] "ENST00000251849" "RAF1"
## [1] "ENST00000442415" "RAF1"
## [1] "ENST00000299440" "RAG1"
## [1] "ENST00000267163" "RB1"
## [1] "ENST00000355710" "RET"
## [1] "ENST00000638465" "RET"
## [1] "gene: RET with transcript: ENST00000638465 do not have any pfam information"
## [1] "ENST00000342988" "SMAD4"
## [1] "ENST00000398417" "SMAD4"
## [1] "ENST00000611848" "SMAD4"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000478613'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000395038" "SOS1"
## [1] "ENST00000404395" "STAT3"
## [1] "ENST00000588969" "STAT3"
## [1] "ENST00000264657" "STAT3"
## [1] "ENST00000326873" "STK11"
## [1] "ENST00000586243" "STK11"
## [1] "ENST00000392927" "SYCP3"
## [1] "ENST00000266743" "SYCP3"
## [1] "ENST00000392924" "SYCP3"
## [1] "ENST00000374333" "TEX11"
## [1] "ENST00000395889" "TEX11"
## [1] "ENST00000344304" "TEX11"
## [1] "ENST00000374320" "TEX11"
## [1] "ENST00000638951" "TEX15"
## [1] "gene: TEX15 with transcript: ENST00000638951 do not have any pfam information"
## [1] "ENST00000256246" "TEX15"
## [1] "ENST00000298171" "TSHR"
## [1] "ENST00000541158" "TSHR"
## [1] "ENST00000338981" "USP9Y"
## [1] "ENST00000453031" "USP9Y"
## [1] "gene: USP9Y with transcript: ENST00000453031 do not have any pfam information"
## [1] "ENST00000256474" "VHL"
## [1] "ENST00000345392" "VHL"
## [1] "ENST00000332351" "WT1"
## [1] "ENST00000639563" "WT1"
## [1] "gene: WT1 with transcript: ENST00000639563 do not have any pfam information"
## [1] "ENST00000379077" "WT1"
## [1] "ENST00000379079" "WT1"
## [1] "ENST00000530998" "WT1"
## [1] "ENST00000375128" "XPA"
## [1] "ENST00000262887" "XRCC1"
## [1] "ENST00000543982" "XRCC1"
## Output the data
pfam_df
## Revert output back to the console
sink(type = "message")
sink()
Schematic depicting how different exons across isoforms of a gene are merged.
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/merge_exons.R")
merged_exons <- merge_exons(df = pfam_df)
## Get the total panel space (exonic)
total <- sum(merged_exons$exon_size)
## Calculate proportion of panel space taken up by each gene
merged_exons$proportion_total <- merged_exons$total_nucleotides/total*100
#head(merged_exons)
## Remove exons found below threshold of total proportion of transcripts for a given gene:
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/transcript_proportion_filter.R")
merged_exons <- transcript_proportion_filter(merged_exons = merged_exons)
## Write the output file
write.table(x = merged_exons, file = paste0("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/output/merged_exons_.txt"),
quote = FALSE, sep = "\t", row.names = FALSE)
summary(merged_exons$exon_size)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.0 92.0 125.0 181.3 164.0 7313.0
## Factor the data so that it plots the x-axis in an order
merged_exons$x_axis <- paste0(merged_exons$gene_name, " (n = " , merged_exons$total_exons, " exons; avg = ", merged_exons$avg_exon_size, ")")
merged_exons$x_axis <- factor(merged_exons$x_axis, levels = unique(merged_exons$x_axis[order(merged_exons$total_nucleotides, decreasing = TRUE)]))
## Generate scatter plot --> Exons (y) per gene (x)
p1 <- ggplot(data = merged_exons) + geom_point(aes(x = x_axis, y = exon_size, color = transcript_num)) +
theme(axis.ticks.x = element_blank(), axis.text.x = element_text(angle = 75, hjust = 1)) +
labs(x = "Gene", y = paste0("Exon Size"), color="# of Transcripts") + scale_y_log10() +
scale_color_gradient(low = "grey", high = "red")
## Show proportion of panel space taken up by each gene
p2 <- ggplot(data = merged_exons) + geom_line(aes(x = x_axis, y = proportion_total, group = 1), color = "Blue") +
theme(axis.ticks.x = element_blank(), axis.text.x = element_blank(), axis.title.x = element_blank()) +
labs(y = paste0("Proportion of", "\n", "Total (%)", "\n")) +
annotate(geom = "text", x = 40, y = 5, label = paste0("Total = ", sum(merged_exons$exon_size), " nucleotides")) +
geom_vline(xintercept = 9.5, linetype = 2, color = "red")
ggarrange(p2, p1, nrow=2, ncol=1, common.legend = TRUE, legend = "right", heights = c(0.5, 2))
## Download [3D Hotspots](http://www.3dhotspots.org/#/home) dataset and manually save as a txt file
wget http://www.3dhotspots.org/files/3d_hotspots.xls
## Download [Cancer Hotspots](https://www.cancerhotspots.org/#/home) MAF file
cd ~/spermseq/data/hotspots
wget http://download.cbioportal.org/cancerhotspots/cancerhotspots.v2.maf.gz
zless cancerhotspots.v2.maf.gz | grep -v "#" | grep -w "protein_coding" > cancerhotspots.v2.maf
zless cancerhotspots.v2.maf.gz | grep -v "#" | head -n 1 > cancerhotspots.v2.maf.header
cat cancerhotspots.v2.maf.header cancerhotspots.v2.maf > cancerhotspots.v2.proteincoding.maf
cat cancerhotspots.v2.proteincoding.maf | cut -f 1,2,5,6,7,38 > cancerhotspots.v2.proteincoding.columns.maf
## Copy to local directory
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/hotspots/cancerhotspots.v2.proteincoding.columns.maf ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data
## Download the liftover file to convert hg19 coordinates from MAF file to hg38
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz
gunzip hg19ToHg38.over.chain.gz > hg19ToHg38.over.chain
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/hotspots/hg19ToHg38.over.chain ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data
## Exons with 3d hotspot mutations
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/format_hotspot.R")
merged_exons_3d_hotspot <- format_3D_hotspot(hotspot_file = "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/3d_hotspots_gao.txt",
genes = genes, merged_exons = merged_exons)
## Exons with snp/onp/ins/del hotspot mutations
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/format_hotspot.R")
hotspot_file <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/cancerhotspots.v2.proteincoding.columns.maf"
merged_exons_hotspot <- format_hotspot(hotspot_file = hotspot_file, genes = genes, merged_exons_3d_hotspot = merged_exons_3d_hotspot)
## [1] "CDKN2A"
## [1] "multiple_transcripts"
merged_exons_hotspot
## Load in necessary packages
library(BSgenome.Hsapiens.UCSC.hg38)
library(BSgenome)
library(GenomicRanges)
## Define GetGC function
GetGC <- function(bsgenome, gr){
seqs <- BSgenome::getSeq(bsgenome, gr)
return(as.numeric(Biostrings::letterFrequency(x = seqs, letters = "GC", as.prob = TRUE)))
}
## Get the IRanges
ranges <- IRanges(start = merged_exons_hotspot$start, end = merged_exons_hotspot$stop)
## Get GRanges object
gr <- GRanges(seqnames = paste0("chr", merged_exons_hotspot$chr), ranges=ranges)
## Get the gc content of each sequence
merged_exons_hotspot$gc_content <- GetGC(bsgenome = BSgenome.Hsapiens.UCSC.hg38, gr = gr)
write.table(x = merged_exons_hotspot, file = "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/output/merged_exons_final.txt",
sep = "\t", quote = FALSE)
## Save the data
save.image("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/final.Rdata")
## Filter out exons/rows that have NA values in pfam_id column
merged_exons_pfam <- na.omit(merged_exons_hotspot)
merged_exons_pfam_hotspot <- merged_exons_pfam[hotspot_3d_exon_num > 0 | hotspot_exon_num > 0]
## Calculate proportion of panel space taken up by each gene
total <- sum(merged_exons_pfam_hotspot$exon_size)
merged_exons_pfam_hotspot <- rbindlist(lapply(unique(merged_exons_pfam_hotspot$gene_id), function(x, merged_exons_pfam_hotspot) {
temp <- merged_exons_pfam_hotspot[gene_id == x]
temp$total_nucleotides <- sum(temp$exon_size)
temp$total_exons <- nrow(temp)
temp$transcript_num <- (temp$transcript_id %>% strsplit(., split = " | "))[[1]] %>% unique() %>% length()
temp$avg_exon_size <- round(mean(temp$exon_size), digits = 2)
return(temp)
}, merged_exons_pfam_hotspot=merged_exons_pfam_hotspot))
merged_exons_pfam_hotspot$proportion_total <- merged_exons_pfam_hotspot$total_nucleotides/total*100
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = merged_exons_pfam_hotspot, genes = genes)
p
## Filter out exons/rows that have NA values in pfam_id column
merged_exons_pfam <- na.omit(merged_exons_hotspot)
## Calculate proportion of panel space taken up by each gene
total <- sum(merged_exons_pfam$exon_size)
merged_exons_pfam <- rbindlist(lapply(unique(merged_exons_pfam$gene_id), function(x, merged_exons_pfam) {
temp <- merged_exons_pfam[gene_id == x]
temp$total_nucleotides <- sum(temp$exon_size)
temp$total_exons <- nrow(temp)
temp$transcript_num <- (temp$transcript_id %>% strsplit(., split = " | "))[[1]] %>% unique() %>% length()
temp$avg_exon_size <- round(mean(temp$exon_size), digits = 2)
return(temp)
}, merged_exons_pfam=merged_exons_pfam))
merged_exons_pfam$proportion_total <- merged_exons_pfam$total_nucleotides/total*100
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = merged_exons_pfam, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.2]
## Generate plot
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.4]
## Generate plot
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.5]
## Generate plot
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.6]
## Generate plot
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.8]
## Generate plot
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p
## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion == 1]
## Generate plot
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p